library(tidyverse)
library(caret)
library(readr)

# Set the working directory to this script's folder and fix the RNG seed.
# rstudioapi only works inside a running RStudio session, so the directory
# change is skipped (the current working directory is kept) when the script is
# run some other way, e.g. with Rscript, instead of failing outright.
in_rstudio <- requireNamespace("rstudioapi", quietly = TRUE) &&
  rstudioapi::isAvailable()
script_path <- ""
if (in_rstudio) {
  script_path <- rstudioapi::getActiveDocumentContext()$path
}
# An empty path (no saved active document) would make setwd() fail, so only
# change directory when a usable path came back.
if (isTRUE(nzchar(script_path))) {
  setwd(dirname(script_path))
} else {
  message("Script location unavailable; keeping working directory: ", getwd())
}
set.seed(65)

# Read the prepared NC data
ncdata <- read_csv("data/nc_ready_for_analysis.csv")

# Recode the NC columns so they match the variables the model uses:
#   - party becomes a factor
#   - the pred.* columns are rounded to 2 decimals; pred.lat is a new column
#     built from pred.his (pred.his itself is left in place)
#   - donations.per.cap is rounded to a whole number
#   - houseprice_rounded is median.house.price rounded to the nearest 10,000
ncdata <- ncdata %>%
  mutate(
    party = factor(party),
    pred.whi = round(pred.whi, digits = 2),
    pred.bla = round(pred.bla, digits = 2),
    pred.lat = round(pred.his, digits = 2),
    pred.asi = round(pred.asi, digits = 2),
    pred.oth = round(pred.oth, digits = 2),
    donations.per.cap = round(donations.per.cap, digits = 0),
    houseprice_rounded = round(median.house.price, digits = -4)
  )

# Build the modelling subset in one pass:
#   1. keep only the ID, the outcome (sr.race) and the necessary variables
#   2. drop every row with a missing value (random forests don't work with
#      missing data)
#   3. treat "Unknown" race as missing and drop those rows too
#   4. turn the outcome into a factor; doing this after the filter means
#      "Unknown" is not kept as a level
ncdata_sub <- ncdata %>%
  select(
    StateVoterID, sr.race, party, sex, age,
    pred.whi, pred.bla, pred.lat, pred.asi, pred.oth,
    income_rounded, college_rounded, pred.vote, zip.population,
    donations.per.cap, donor,
    cvap.pct.white, cvap.pct.asian, cvap.pct.black, cvap.pct.latino,
    homeowner_rounded, houseprice_rounded
  ) %>%
  na.omit() %>%
  filter(sr.race != "Unknown") %>%
  mutate(sr.race = factor(sr.race))

# Load the fitted model object. load() returns the names of the objects it
# restored, so confirm the file really supplied fit_classweights rather than
# silently using a stale object of that name already in the workspace.
loaded_objects <- load("random_forest_model_object.Rdata")
if (!"fit_classweights" %in% loaded_objects) {
  stop(
    "random_forest_model_object.Rdata does not contain 'fit_classweights'",
    call. = FALSE
  )
}

# Predict the NC values with the loaded model and store them as rf.race
ncdata_sub$rf.race <- predict(fit_classweights, newdata = ncdata_sub)

# Save the NC data out with the predicted values attached
write_csv(ncdata_sub, file = "data/ncdata_clean_predicted.csv")
